#setwd("/home/gbakie/neu/stat-sp16/project/Online_News_Popularity")
setwd("/Users/Darshan/Documents/Online_News_Popularity")

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
source("DataPreprocess.R")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Warning: package 'car' was built under R version 3.2.4
# Fix the random-number seed so that any randomised step below is repeatable
# from run to run.
set.seed(464)

# Switch to the data directory so the relative file name "Train.csv" resolves.
# NOTE(review): this is a machine-specific absolute path; the commented-out
# line below is the equivalent location on another contributor's machine.
setwd("/Users/Darshan/Documents/CS 7280 Stats/Project/Data/")
#setwd("/home/gbakie/neu/stat-sp16/project/data")

# Load the training set; the first row of the CSV holds the column names.
news <- read.csv("Train.csv", header = TRUE)

# Preprocessing helpers are not defined in this file (presumably they come
# from DataPreprocess.R -- confirm). Each takes the data frame and the result
# replaces `news`, so the order of these calls matters.
news <- data_cleaning(news)
news <- correlation_cleaning(news)

# Cannot apply the transformation when using weighted regression

# target_transformation() returns a list; its `news` element is the updated
# data frame and its `lambda` element is kept for later use (presumably the
# transformation parameter needed to invert the transform -- TODO confirm).
# Note the variable is spelled `lamda` while the list element is `lambda`.
return_obj <- target_transformation(news)
news <- return_obj$news
lamda <- return_obj$lambda

# normalization() also returns a list; only its `news` element is used here,
# but the whole object stays available in `obj`.
obj <- normalization(news)
news <- obj$news

news <- cat_encoding(news)

# Keep the article URLs in their own vector, then drop the column from the
# modelling data frame.
url <- news$url
news$url <- NULL

# Columns named like 0/1 indicator flags for the article's data channel and
# its publication weekday.
categorical_var <- c(
  "data_channel_is_lifestyle", "data_channel_is_entertainment",
  "data_channel_is_bus", "data_channel_is_world",
  "data_channel_is_socmed", "data_channel_is_tech",
  "weekday_is_monday", "weekday_is_tuesday", "weekday_is_wednesday",
  "weekday_is_thursday", "weekday_is_friday", "weekday_is_saturday",
  "weekday_is_sunday"
)

# Set the indicator columns aside in their own data frame, then keep only the
# remaining columns in `news`.
news_with_cat <- news[, categorical_var, drop = FALSE]
news <- news[, setdiff(names(news), categorical_var), drop = FALSE]

#news <- cook_outliers_removal(news)

# Columns that must not be plotted against the residuals: identifiers, the
# indicator flags above, the response (`shares`) and the factor predictors.
ignored_column_names <- c(
  "url", "timedelta", categorical_var,
  "is_weekend", "shares", "data_channel", "cat_dow"
)

# Everything still present in `news` that is not on the ignore list.
column_names <- names(news)
needed_columns <- setdiff(column_names, ignored_column_names)

# Linear regression of `shares` on the selected predictors, fitted on the
# preprocessed training data. The order of the terms is kept as-is because it
# determines the order of the fitted coefficients.
model <- lm(
  formula = shares ~
    data_channel + cat_dow +
    i_kw_max_avg_avg + self_reference_avg_sharess + i_kw_avg_max_max +
    num_hrefs + global_subjectivity +
    LDA_00 + LDA_01 + LDA_02 +
    num_self_hrefs + i_n_unique_tokens_content +
    i_title_subjectivity_sentiment_polarity + abs_title_subjectivity +
    n_tokens_title + min_positive_polarity + num_imgs +
    average_token_length + title_sentiment_polarity +
    i_min_avg_negative_pol,
  data = news
)

# Residual diagnostics: for every column in `needed_columns`, draw the model
# residuals against that column with a smoothed trend line, so that any
# remaining structure the linear fit has missed becomes visible.
#
# With the default na.action, lm() silently drops rows containing missing
# values, so the residual vector can be shorter than `news`. The residuals are
# therefore matched back to their rows by row name instead of being assumed to
# line up by position.
model_resid <- residuals(model)
resid_rows <- match(names(model_resid), rownames(news))

for (column in needed_columns) {
  print(column)

  # Put both variables in the plot data so the mapping refers to real columns
  # rather than to a free-standing vector passed through aes_string().
  plot_data <- data.frame(
    predictor = news[[column]][resid_rows],
    residual = unname(model_resid)
  )

  p <- ggplot(plot_data, aes(x = predictor, y = residual)) +
    geom_point() +
    stat_smooth() +
    labs(x = column, y = "residual")

  # Inside a loop a ggplot object is only drawn when printed explicitly.
  print(p)
}
## [1] "n_tokens_title"

## [1] "num_hrefs"

## [1] "num_self_hrefs"

## [1] "num_imgs"

## [1] "num_videos"

## [1] "average_token_length"

## [1] "num_keywords"

## [1] "kw_min_avg"

## [1] "self_reference_avg_sharess"

## [1] "LDA_00"

## [1] "LDA_01"

## [1] "LDA_02"

## [1] "LDA_03"

## [1] "LDA_04"

## [1] "global_subjectivity"

## [1] "global_rate_positive_words"

## [1] "global_rate_negative_words"

## [1] "avg_positive_polarity"

## [1] "min_positive_polarity"

## [1] "max_positive_polarity"

## [1] "max_negative_polarity"

## [1] "title_sentiment_polarity"

## [1] "abs_title_subjectivity"

## [1] "i_n_unique_tokens_content"

## [1] "i_title_subjectivity_sentiment_polarity"

## [1] "i_min_avg_negative_pol"

## [1] "i_rate_pos_glob_sent_polarity"

## [1] "i_kw_max_avg_min"

## [1] "i_kw_max_avg_avg"

## [1] "i_kw_avg_max_max"